* Work from the matching-data folder; every file name below is relative to it.
cd D:\KRpatent\data\matching

/* perfect */
* Read the "*"-delimited perfect-match file and discard its first column (v1).
import delimited matched_perfect_k2dg_ver2.csv, delimiter("*") encoding(UTF-8) clear
drop v1
format kiprisid %15.0g
** drop individuals
* A row is dropped when kiprisid divided by 1e11 rounds to 4.
drop if round(kiprisid/100000000000, 1) == 4
save perfect_temp, replace
** drop if no registered patent
* Build the list of distinct kiprisid values flagged regi == 1 in the assignee file.
use assignee, clear
keep if regi == 1
keep kiprisid
duplicates drop
save registered_kiprisid, replace
* Keep only the perfect-match rows whose kiprisid appears in that list.
merge 1:m kiprisid using perfect_temp, keep(match) nogenerate
* dup  = 0 when a symbol occurs once, otherwise the row's position in its group.
* mdup = the largest dup within the symbol (0 for a symbol that occurs once).
quietly bysort symbol: gen dup = cond(_N == 1, 0, _n)
quietly by symbol: egen mdup = max(dup)
* Symbols attached to more than five rows are discarded.
drop if mdup > 5
save perfect_temp, replace
* kiprisid:symbol = n:1
* Symbols that occur exactly once (mdup == 0) are accepted directly.
use if mdup == 0 using perfect_temp, clear
drop dup mdup
save perfect_matched, replace
* Symbols that occur several times (mdup > 0) are set aside for further screening.
use if mdup > 0 using perfect_temp, clear
save perfect_dup1, replace
* dup2 = 0
* Among the duplicated symbols, keep the kiprisid values that occur only once.
use perfect_dup1, clear
quietly bysort kiprisid: gen dup2 = cond(_N == 1, 0, _n)
keep if dup2 == 0

/* assg_namstand can be obtained by running "0. namestd > nameonly_main.do"
where the input file is "KIPRIS_cleaned.dta"
from "1. corpnum folder > KIPRIS_cleaning.do" */
* Attach the KIPRIS-side loc2 and standardized name; only matched rows survive.
merge 1:m kiprisid loc1 using assg_namstand, keep(match) nogenerate
keep kiprisid src_name symbol trg_name loc1 loc2 standard_name
rename loc2 loc2_kip
rename standard_name std_kip
format loc1 loc2_kip std_kip %20s

/*
dg_namstand_all can be obtained by running "0. namestd > nameonly_main.do"
where the input file is "dg_temp.dta"
from "1. corpnum folder > KIPRIS_cleaning.do"
*/
* Attach the DG-side loc2 and standardized name; the merge key there is Symbol.
rename symbol Symbol
merge m:1 Symbol using dg_namstand_all, keep(match) nogenerate
keep kiprisid src_name Symbol trg_name loc1 loc2_kip loc2 std_kip standard_name
rename loc2 loc2_dg
rename standard_name std_dg
format loc1 loc2_kip loc2_dg std_dg %20s
* temp holds every candidate pair with both sides' loc2 and standardized names.
save temp, replace
** loc2 match (-> some kipris ids need harmonization)
* Accept the candidate pairs whose KIPRIS-side and DG-side loc2 are equal,
* then add them to the accepted perfect matches.
keep if loc2_kip == loc2_dg
drop loc2_kip std_kip loc2_dg std_dg
rename Symbol symbol
append using perfect_matched
save perfect_matched, replace
** perfect match of ENG std names
use temp, clear
* Skip every Symbol that already has at least one loc2-equal pair.
gen loc2_same = (loc2_kip == loc2_dg)
quietly bysort Symbol: egen any_loc2_same = max(loc2_same)
drop if any_loc2_same == 1
drop any_loc2_same loc2_same
* Strip legal-form suffixes from both standardized names.
* The order matters: longer forms such as " CO INC" are removed before " INC".
foreach namevar of varlist std_kip std_dg {
    replace `namevar' = subinstr(`namevar', " COLTD", "", 30)
    replace `namevar' = subinstr(`namevar', " CO LTD", "", 30)
    replace `namevar' = subinstr(`namevar', " LTD CO", "", 30)
    replace `namevar' = subinstr(`namevar', " CO INC", "", 30)
    replace `namevar' = subinstr(`namevar', " CORP", "", 30)
    replace `namevar' = subinstr(`namevar', " INC", "", 30)
    replace `namevar' = subinstr(`namevar', " IND", "", 30)
}
* Keep all rows of a Symbol as soon as one of its pairs has identical stripped names.
* NOTE(review): rows of that Symbol whose names differ are kept as well - confirm intended.
gen name_same = (std_kip == std_dg)
quietly bysort Symbol: egen any_name_same = max(name_same)
keep if any_name_same == 1
rename Symbol symbol
drop loc2_kip std_kip loc2_dg std_dg name_same any_name_same
append using perfect_matched
save perfect_matched, replace
* dup2 > 0
* Among the duplicated symbols, look at kiprisid values that occur more than once.
use perfect_dup1, clear
quietly bysort kiprisid: gen dup2 = cond(_N == 1, 0, _n)
keep if dup2 > 0
* Only this hand-picked list of KIPRIS IDs is retained.
keep if inlist(kiprisid, 119981039790, 120130562078, 120150183711, 120100269739, ///
    119981115381, 120110591361, 120070487163, 120110341367, 119980048913, ///
    120110314616, 120040233811)
* Hand-made exclusions of individual kiprisid/symbol pairs.
drop if kiprisid == 120110591361 & inlist(symbol, "B054135", "B058193")
drop if kiprisid == 120070487163 & symbol != "B058193"
drop if kiprisid == 120110341367 & symbol != "B064315"
drop if kiprisid == 119980048913 & symbol != "B076972"
drop dup mdup dup2
append using perfect_matched
format loc1 %20s
save perfect_matched, replace

* harmonization
* Build the lookup kiprisid -> kip_harm: when several KIPRIS IDs share one symbol,
* all of them are mapped to the smallest kiprisid of that symbol.
use perfect_matched, clear
keep kiprisid symbol
sort symbol
quietly by symbol: gen dup2 = cond(_N == 1, 0, _n)
keep if dup2 > 0
* Request double explicitly: egen without a storage type uses the default type
* (float unless -set type- was changed), which cannot store a 12-digit ID exactly.
by symbol: egen double kip_harm = min(kiprisid)
format kip_harm %15.0g
keep kiprisid kip_harm
duplicates drop
save kip_harm, replace

* kiprisid:symbol = 1:n
* Part 1: harmonized IDs that still map to several symbols.
use perfect_matched, clear
merge m:1 kiprisid using kip_harm
* IDs without a harmonized counterpart keep their own value.
replace kip_harm = kiprisid if kip_harm == .
drop kiprisid _merge
duplicates drop
quietly bysort kip_harm: gen dup = cond(_N == 1, 0, _n)
quietly by kip_harm: egen mdup = max(dup)
keep if mdup > 0
save perfect_dup2, replace

* Of those, keep only the hand-verified kip_harm/symbol pairs listed below.
use perfect_dup2, clear
gen verified = 0
replace verified = 1 if kip_harm == 120100132477 & symbol == "B030318"
replace verified = 1 if kip_harm == 120100269739 & symbol == "B049138"
replace verified = 1 if kip_harm == 120130039874 & symbol == "B030778"
replace verified = 1 if kip_harm == 120140411397 & symbol == "B074969"
replace verified = 1 if kip_harm == 120140458937 & symbol == "B059505"
replace verified = 1 if kip_harm == 120130562078 & symbol == "B018142"
replace verified = 1 if kip_harm == 120040233811 & symbol == "B081083"
replace verified = 1 if kip_harm == 119980003198 & symbol == "A005330"
replace verified = 1 if kip_harm == 119980044203 & symbol == "B065991"
replace verified = 1 if kip_harm == 119981039790 & symbol == "B029666"
replace verified = 1 if kip_harm == 119981059479 & symbol == "A005410"
replace verified = 1 if kip_harm == 119981115381 & symbol == "B049138"
replace verified = 1 if kip_harm == 120040223560 & symbol == "B051324"
keep if verified == 1
drop dup mdup verified
save temp, replace

* Part 2: harmonized IDs that map to exactly one symbol are accepted as they are.
use perfect_matched, clear
merge m:1 kiprisid using kip_harm
replace kip_harm = kiprisid if kip_harm == .
drop kiprisid _merge
duplicates drop
quietly bysort kip_harm: gen dup = cond(_N == 1, 0, _n)
quietly by kip_harm: egen mdup = max(dup)
keep if mdup == 0
keep kip_harm src_name symbol trg_name loc1
* Add back the hand-verified pairs from part 1 and store the result.
append using temp
order kip_harm
save perfect_matched, replace



/* scorebased */
* First file: score-based matches, k2dg version.
import delimited matched_scorebased_k2dg_ver2.csv, delimiter("*") encoding(UTF-8) clear
drop v1 sc v rsc
format kiprisid %15.0g
format src_stem trg_stem loc1 %20s
save temp, replace

* Second file: score-based matches, dg2k version. Its kiprisid and symbol columns
* are named the other way round, so the two names are swapped first.
import delimited matched_scorebased_dg2k_ver2.csv, delimiter("*") encoding(UTF-8) clear
drop v1 sc v rsc
rename (kiprisid symbol) (symbol kiprisid)
format kiprisid %15.0g
format src_stem trg_stem loc1 %20s

* A pair is accepted only when it is present in both files.
merge 1:1 symbol kiprisid using temp, keep(match) nogenerate
rename kiprisid kip_harm
keep kip_harm src_stem trg_stem symbol loc1
save scorebased_matched, replace

/* CorpNum */
use corpnum_cleaned, clear
keep if match_ph == "CorpNum"
* stype is the encoded first character of Symbol; for each kiprisid only the rows
* carrying the smallest code are kept.
gen first_char = substr(Symbol, 1, 1)
encode first_char, gen(stype)
drop first_char
sort kiprisid
quietly by kiprisid: egen mstype = min(stype)
keep if mstype == stype
* If a kiprisid still has several rows, drop those with loc2_matched == 0.
quietly by kiprisid: gen dup = cond(_N == 1, 0, _n)
drop if dup > 0 & loc2_matched == 0
keep kiprisid Symbol
rename Symbol symbol
* Keep only KIPRIS IDs present in the registered list.
merge 1:1 kiprisid using registered_kiprisid, keep(match) nogenerate
save CorpNum, replace

* Stack the score-based and perfect matches; rows with a non-empty src_stem are
* labelled "score", the others "perfect".
use scorebased_matched, clear
append using perfect_matched
gen phase = cond(src_stem != "", "score", "perfect")
keep symbol kip_harm phase
* Add the CorpNum matches: they carry kiprisid rather than kip_harm and no phase yet.
append using CorpNum
replace kip_harm = kiprisid if kip_harm == .
drop kiprisid
replace phase = "corpnum" if phase == ""
rename phase phase_kip
* Combine with the USPTO matches and record which side each symbol was found on.
merge 1:1 symbol using uspto_matches /* result of 1. uspto_matches.do */
gen type = "Konly" if _merge == 1
replace type = "Uonly" if _merge == 2
replace type = "Both" if _merge == 3
drop _merge
rename phase phase_uspto
order symbol kip_harm assgidH type phase_kip phase_uspto
save final_matches, replace

/* CorpNum Updates */
/* This part was updated on Mar 6th 2019 
=> unnecessary if you use "matching corp_num to KIPRIS ID.py" */

* Stack the three CorpNum update files and tag each row with its source in ph.
use dup_corpnum_cleaned, clear
gen ph = "dup"
append using one_corpnum_cleaned
replace ph = "one" if ph == ""
append using new_corpnum_cleaned
replace ph = "new" if ph == ""
keep kiprisid Symbol ph
* Replace each KIPRIS ID by its harmonized ID where the lookup has one.
merge 1:1 kiprisid using kip_harm
drop if _merge == 2
* _merge only takes the values 1, 2 and 3. The previous test (_m == 30000000)
* could never be true, so the harmonized ID was never applied.
replace kiprisid = kip_harm if _merge == 3
drop _merge kip_harm
rename Symbol symbol
rename kiprisid kip_harm
save temp_update, replace
* harmonize
* Build the lookup kip_harm -> kiprisidH: when a symbol appears on several rows after
* adding the CorpNum updates, all of its IDs are mapped to the smallest ID of the symbol.
use temp_update, clear
append using final_matches
drop if type == "Uonly"
sort symbol
quietly by symbol: gen dup = cond(_N == 1, 0, _n)
keep kip_harm symbol dup
keep if dup > 0
rename kip_harm temp
* Request double explicitly: egen without a storage type uses the default type
* (float unless -set type- was changed), which cannot store a 12-digit ID exactly.
quietly by symbol: egen double kip_harm = min(temp)
format kip_harm %20.0g
keep temp kip_harm
rename kip_harm kiprisidH
rename temp kip_harm
duplicates drop
save kip_harm2, replace
* update
use temp_update, clear
append using final_matches
* When a kip_harm/symbol pair appears more than once, drop the copies without a type.
quietly bysort kip_harm symbol: gen dup = cond(_N == 1, 0, _n)
drop if dup > 0 & type == ""
drop dup
* Rows without an ID get the row number as a placeholder key for the 1:1 merge;
* values below 1000000 are set back to missing after the merge.
replace kip_harm = _n if kip_harm == .
merge 1:1 kip_harm using kip_harm2
replace kiprisidH = kip_harm if kiprisidH == .
replace phase_kip = "corpnum" if phase_kip == ""
replace type = "Konly" if type == "" & assgidH == .
replace kiprisidH = . if kiprisidH < 1000000
keep kiprisidH assgidH symbol type phase_kip phase_uspto
order kiprisidH assgidH symbol type phase_kip phase_uspto

* Hand corrections for symbol A015760.
replace assgidH     = 3736         if symbol == "A015760"
replace kiprisidH   = 219999001385 if symbol == "A015760"
replace type        = "Both"       if symbol == "A015760"
replace phase_uspto = "Fperfect"   if symbol == "A015760"
* Hand corrections for symbol A060310.
replace assgidH     = 10           if symbol == "A060310"
replace kiprisidH   = 119981110669 if symbol == "A060310"
replace type        = "Both"       if symbol == "A060310"
replace phase_uspto = "Fperfect"   if symbol == "A060310"

duplicates drop
* Among repeated kiprisidH/symbol pairs, drop the rows that have no assgidH.
quietly bysort kiprisidH symbol: gen dup = cond(_N == 1, 0, _n)
drop if dup > 0 & assgidH == .
drop dup
save final_matches_updated, replace
* keep only registered kiprisid for scorebased phase
* Every row whose kiprisidH is found in update_scorebased is removed.
* NOTE(review): rows that exist only in update_scorebased stay in the result - confirm intended.
use final_matches_updated, clear
merge m:1 kiprisidH using update_scorebased, keep(master using) nogenerate
save matching_table, replace





